# read in the data and rename the column name to align with cloud data in the data dictionaries
image1<- as.data.frame(read.table("image_data/image1.txt"))
image1$picture<- rep("image1", nrow(image1))
colnames(image1) [1:11] <- c("y_coordinate", "x_coordinate", "label", "NDAI", "SD", "CORR", "DF_angle", "CF_angle" , "BF_angle", "AF_angle", "AN_angle")
image2<- as.data.frame(read.table("image_data/image2.txt"))
image2$picture<- rep("image2", nrow(image2))
colnames(image2) [1:11] <- c("y_coordinate", "x_coordinate", "label", "NDAI", "SD", "CORR", "DF_angle", "CF_angle" , "BF_angle", "AF_angle", "AN_angle")
image3<- as.data.frame(read.table("image_data/image3.txt"))
image3$picture<- rep("image3", nrow(image3))
colnames(image3) [1:11] <- c("y_coordinate", "x_coordinate", "label", "NDAI", "SD", "CORR", "DF_angle", "CF_angle" , "BF_angle", "AF_angle", "AN_angle")
total_df <- rbind(image1, image2, image3)
ggplot(image1)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate,
color = factor(label)))+
xlab(" x coordinate") +
ylab(" y coordinate") +
ggtitle("Coordinate Map image1 with label ")+
theme_bw()
ggplot(image2)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate,
color = factor(label)))+
xlab(" x coordinate") +
ylab(" y coordinate") +
ggtitle("Coordinate Map image2 with label ")+
theme_bw()
ggplot(image3)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate,
color = factor(label)))+
xlab(" x coordinate") +
ylab(" y coordinate") +
ggtitle("Coordinate Map image3 with label ")+
theme_bw()
#Check if there's any missing data
sum(is.na(total_df))
## [1] 0
#There is no missing value
summary(total_df)
## y_coordinate x_coordinate label NDAI
## Min. : 2.0 Min. : 65.0 Min. :-1.0000 Min. :-1.8420
## 1st Qu.: 98.0 1st Qu.:143.0 1st Qu.:-1.0000 1st Qu.:-0.4286
## Median :193.0 Median :218.0 Median : 0.0000 Median : 1.3476
## Mean :193.1 Mean :218.1 Mean :-0.1334 Mean : 1.0847
## 3rd Qu.:289.0 3rd Qu.:294.0 3rd Qu.: 0.0000 3rd Qu.: 2.3142
## Max. :383.0 Max. :369.0 Max. : 1.0000 Max. : 4.5639
## SD CORR DF_angle CF_angle
## Min. : 0.1987 Min. :-0.3872 Min. : 45.28 Min. : 31.19
## 1st Qu.: 1.6376 1st Qu.: 0.1253 1st Qu.:244.56 1st Qu.:219.27
## Median : 4.3095 Median : 0.1603 Median :281.91 Median :259.31
## Mean : 8.0633 Mean : 0.1860 Mean :271.36 Mean :246.37
## 3rd Qu.: 10.2264 3rd Qu.: 0.2231 3rd Qu.:300.39 3rd Qu.:279.59
## Max. :117.5810 Max. : 0.8144 Max. :410.53 Max. :360.68
## BF_angle AF_angle AN_angle picture
## Min. : 24.49 Min. : 21.07 Min. : 20.57 Length:345556
## 1st Qu.:200.79 1st Qu.:185.16 1st Qu.:174.88 Class :character
## Median :236.17 Median :211.54 Median :197.58 Mode :character
## Mean :224.20 Mean :201.71 Mean :188.29
## 3rd Qu.:258.62 3rd Qu.:235.15 3rd Qu.:216.80
## Max. :335.08 Max. :318.70 Max. :306.93
#Calculate the proportion
percentage_label<- total_df %>%
group_by(picture, label) %>%
summarise (n = n()) %>%
mutate(freq = n / sum(n))
percentage_label
## # A tibble: 9 x 4
## # Groups: picture [3]
## picture label n freq
## <chr> <dbl> <int> <dbl>
## 1 image1 -1 50446 0.438
## 2 image1 0 44312 0.385
## 3 image1 1 20471 0.178
## 4 image2 -1 42882 0.373
## 5 image2 0 32962 0.286
## 6 image2 1 39266 0.341
## 7 image3 -1 33752 0.293
## 8 image3 0 60221 0.523
## 9 image3 1 21244 0.184
g1 <- ggplot(data=percentage_label[percentage_label$picture=='image1',],
aes(x=label, y=freq)) +
geom_bar(stat="identity", color="black", fill="lightskyblue3")+
ylab("Proportion")+
ylim(0,0.6)
g2 <- ggplot(data=percentage_label[percentage_label$picture=='image2',],
aes(x=label, y=freq)) +
geom_bar(stat="identity", color="black", fill="lightskyblue3")+
ylab(NULL)+
ylim(0,0.6)
g3 <- ggplot(data=percentage_label[percentage_label$picture=='image3',],
aes(x=label, y=freq)) +
geom_bar(stat="identity", color="black", fill="lightskyblue3")+
ylab(NULL)+
ylim(0,0.6)
grid.arrange(g1,g2,g3,ncol=3,top = textGrob("Proportion of label in each image",
gp = gpar(fontsize = 16)))
labels<- factor(total_df$label)
cor(total_df[,c(-12)])
## y_coordinate x_coordinate label NDAI SD
## y_coordinate 1.000000000 -0.006376002 -0.213372321 -0.3276781 -0.2646997
## x_coordinate -0.006376002 1.000000000 -0.465822566 -0.5231960 -0.3233889
## label -0.213372321 -0.465822566 1.000000000 0.6169346 0.2954477
## NDAI -0.327678096 -0.523195958 0.616934624 1.0000000 0.6310626
## SD -0.264699672 -0.323388894 0.295447745 0.6310626 1.0000000
## CORR -0.254309102 -0.361793127 0.444059231 0.4034998 0.2968385
## DF_angle 0.378983970 0.081274648 0.006550085 -0.1610916 -0.2061691
## CF_angle 0.472777594 0.269869879 -0.208279170 -0.3622113 -0.3688601
## BF_angle 0.520812966 0.339539816 -0.337948500 -0.4629301 -0.4404404
## AF_angle 0.530441793 0.368160603 -0.389741017 -0.4927484 -0.4555423
## AN_angle 0.515677887 0.383211616 -0.389358825 -0.4895267 -0.4466229
## CORR DF_angle CF_angle BF_angle AF_angle
## y_coordinate -0.2543091 0.378983970 0.4727776 0.5208130 0.5304418
## x_coordinate -0.3617931 0.081274648 0.2698699 0.3395398 0.3681606
## label 0.4440592 0.006550085 -0.2082792 -0.3379485 -0.3897410
## NDAI 0.4034998 -0.161091626 -0.3622113 -0.4629301 -0.4927484
## SD 0.2968385 -0.206169130 -0.3688601 -0.4404404 -0.4555423
## CORR 1.0000000 0.126283481 -0.1660966 -0.4311043 -0.6039353
## DF_angle 0.1262835 1.000000000 0.8495716 0.6991073 0.5910958
## CF_angle -0.1660966 0.849571562 1.0000000 0.9119430 0.8216176
## BF_angle -0.4311043 0.699107255 0.9119430 1.0000000 0.9529627
## AF_angle -0.6039353 0.591095794 0.8216176 0.9529627 1.0000000
## AN_angle -0.6820440 0.547609821 0.7727499 0.9043409 0.9706198
## AN_angle
## y_coordinate 0.5156779
## x_coordinate 0.3832116
## label -0.3893588
## NDAI -0.4895267
## SD -0.4466229
## CORR -0.6820440
## DF_angle 0.5476098
## CF_angle 0.7727499
## BF_angle 0.9043409
## AF_angle 0.9706198
## AN_angle 1.0000000
corrplot.mixed(cor(total_df[,c(-3, -12)]))
G<- ggplot(total_df)+geom_density(aes(x= NDAI, group= label, color=labels, fill= labels), color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of NDAI based on labels")+
theme_minimal()
H<- ggplot(total_df)+geom_density(aes(x= log(SD), group= label, color=labels, fill= labels), color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of Log SD based on labels")+
theme_minimal()
I<- ggplot(total_df)+geom_density(aes(x= CORR, group= label, color=labels, fill= labels), color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of CORR based on labels")+
theme_minimal()
A<- ggplot(total_df)+geom_density(aes(x= DF_angle, group= label, color=labels, fill= labels), color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of DF_angle based on labels")+
theme_minimal()
B<- ggplot(total_df)+geom_density(aes(x= CF_angle, group= label, color=labels, fill= labels), color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of CF_angle based on labels")+
theme_minimal()
C<- ggplot(total_df)+geom_density(aes(x= BF_angle, group= label, color=labels, fill= labels), color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of BF_angle based on labels")+
theme_minimal()
D<-ggplot(total_df)+geom_density(aes(x= AF_angle, group= label, color=labels, fill= labels), color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of AF_angle based on labels")+
theme_minimal()
E<- ggplot(total_df)+geom_density(aes(x= AN_angle, group= label, color=labels, fill= labels), color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of AN_angle based on labels")+
theme_minimal()
plot_grid(B,C,D,E,nrow=2, align="h")
plot_grid(A,G,H,I,nrow=2, align="h")
image1<- image1 %>% filter(label !=0)
image2<- image2 %>% filter(label !=0)
image3<- image3 %>% filter(label !=0)
traintest_split<- function(data){
res<-list()
trainIndex <- createDataPartition(data$label, p = .8,
list = FALSE,
times = 1)
res$train<- data[ trainIndex,]
res$test<- data[-trainIndex,]
return(res)
}
trainval_split<- function(data){
res<-list()
valIndex <- createDataPartition(data$label, p = .2,
list = FALSE,
times = 1)
res$val<- data[ valIndex,]
res$train<- data[-valIndex,]
return(res)
}
##image 1
#Test-train split
train1_label_split1<- traintest_split(image1)$train
# get test from train-test split
test_label_split1<- traintest_split(image1)$test
#Train-val split ---get val
val_label_split1<- trainval_split(train1_label_split1)$val
#Get train
train_label_split1<-trainval_split(train1_label_split1)$train
##image 2
train2_label_split2<- traintest_split(image2)$train
# get test from train-test split
test_label_split2<- traintest_split(image2)$test
#Train-val split ---get val
val_label_split2<- trainval_split(train2_label_split2)$val
#Get train
train_label_split2<-trainval_split(train2_label_split2)$train
##image 3
train3_label_split3<- traintest_split(image3)$train
# get test from train-test split
test_label_split3<- traintest_split(image3)$test
#Train-val split ---get val
val_label_split3<- trainval_split(train3_label_split3)$val
#Get train
train_label_split3<-trainval_split(train3_label_split3)$train
method1_train<-rbind(train_label_split1,train_label_split2,train_label_split3)
method1_val<-rbind(val_label_split1,val_label_split2,val_label_split3)
method1_test<- rbind(test_label_split1,test_label_split2,test_label_split3)
method1_train_val<-rbind(method1_train,method1_val)
method1_train_val$label<-factor(method1_train_val$label)
#Check Duplicates
image1 %>% distinct()
grid_row <- 10
grid_col <- 10
datasplit<- function(grid_row, grid_col, image){
ret <- list()
train_data <- data.frame()
val_data <- data.frame()
test_data <- data.frame()
min_y_cor <- min(image$y_coordinate)
min_x_cor <- min(image$x_coordinate)
max_y_cor <- max(image$y_coordinate)
max_x_cor <- max(image$x_coordinate)
divide_row <- seq(min_y_cor,max_y_cor+1,(max_y_cor+1-min_y_cor)/(grid_row-1))
divide_col <- seq(min_x_cor,max_x_cor+1,(max_x_cor+1-min_x_cor)/(grid_col-1))
for (i in 1:length(divide_row)){
for (j in 1:length(divide_col)){
chunk <- image[image$y_coordinate>=divide_row[i] & image$y_coordinate<divide_row[i+1] &
image$x_coordinate>=divide_col[j] & image$x_coordinate<divide_col[j+1], ]
test_and_val <- floor(nrow(chunk)*0.4)
test_and_val_ind <- sample(seq_len(nrow(chunk)), size = test_and_val)
val_size <- floor(test_and_val/2)
val_ind_ind <- sample(length(test_and_val_ind), size = val_size)
val_ind <- test_and_val_ind[val_ind_ind]
test_ind <- test_and_val_ind[-val_ind_ind]
test<- chunk[test_ind, ]
val<- chunk[val_ind, ]
train<- chunk[-test_and_val_ind, ]
train_data = rbind(train_data,train)
test_data = rbind(test_data,test)
val_data = rbind(val_data,val)
}
}
ret$training <- train_data
ret$validation <- val_data
ret$test <- test_data
return(ret)
}
split_result_1 <- datasplit(10,10,image1)
split_result_2 <- datasplit(10,10,image2)
split_result_3 <- datasplit(10,10,image3)
train_total<- rbind(split_result_1$training,split_result_2$training,split_result_3$training)
val_total<- rbind(split_result_1$validation,split_result_2$validation,split_result_3$validation)
test_total<- rbind(split_result_1$test,split_result_2$test,split_result_3$test)
train_total$picture<- NULL
val_total$picture<- NULL
test_total$picture<-NULL
train_total$label<-factor(train_total$label)
test_trivial<- test_total
test_trivial$label<--1
train_trivial<- train_total
train_trivial$label<--1
val_trivial<- val_total
val_trivial$label<--1
sum(test_trivial$label ==test_total$label)/length(test_total$label)
## [1] 0.6104143
sum(val_trivial$label ==val_total$label)/length(val_trivial$label)
## [1] 0.6107753
sum(train_trivial$label ==train_total$label)/length(train_trivial$label)
## [1] 0.6109172
train_val<- rbind(train_total, val_total)
train_val$label<- as.factor(train_val$label)
test_total$label<- as.factor(test_total$label)
#visual
A<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= y_coordinate, color= factor(label)))+ theme_bw()+ ggtitle("y_coordinate and label")+
xlab("label")+theme(plot.title = element_text(size = rel(1), vjust = 1.5,face="bold.italic"))+
theme(legend.position="none")
B<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= x_coordinate,color= factor(label)))+ theme_bw()+ ggtitle("x_coordinate and label")+theme(legend.position="none")+
xlab("label")+theme(plot.title = element_text(size = rel(1), vjust = 1.5,face="bold.italic"))
C<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= NDAI,color= factor(label)))+ theme_bw()+
ggtitle("label and NDAI")+theme(legend.position="none")+
xlab("label")+theme(plot.title = element_text(size = rel(1), vjust = 1.5,face="bold.italic"))
D<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= SD,color= factor(label)))+ theme_bw()+
ggtitle("label and SD")+theme(legend.position="none")+
xlab("label")+theme(plot.title = element_text(size = rel(1), vjust = 1.5,face="bold.italic"))
E<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= CORR,color= factor(label)))+ theme_bw()+
ggtitle("label and CORR")+theme(legend.position="none")+
xlab("label")+theme(plot.title = element_text(size = rel(1), vjust = 1.5,face="bold.italic"))
G<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= DF_angle,color= factor(label)))+ theme_bw()+ theme(legend.position="none")+
ggtitle("label and DF_angle")+
xlab("label")+theme(plot.title = element_text(size = rel(1), vjust = 1.5,face="bold.italic"))
H<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= CF_angle,color= factor(label)))+ theme_bw()+ theme(legend.position="none")+
ggtitle("label and CF_angle")+
xlab("label")+theme(plot.title = element_text(size = rel(1), vjust = 1.5,face="bold.italic"))
I<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= BF_angle,color= factor(label)))+ theme_bw()+ theme(legend.position="none")+
ggtitle("label and BF_angle")+
xlab("label")+theme(plot.title = element_text(size = rel(1), vjust = 1.5,face="bold.italic"))
J<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= AF_angle,color= factor(label)))+ theme_bw()+ theme(legend.position="none")+
ggtitle("label and AF_angle")+
xlab("label")+theme(plot.title = element_text(size = rel(1), vjust = 1.5,face="bold.italic"))
K<-I<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= AN_angle,color= factor(label)))+ theme_bw()+
ggtitle("label and AN_angle")+theme(legend.position="none")+
xlab("label")+theme(plot.title = element_text(size = rel(1), vjust = 1.5,face="bold.italic"))
plot_grid(A, B,C,D,E,G,H,I,J,K,nrow=3, align="h")
#quantitative
summary(lm(as.numeric(label)~NDAI, data = train_val))$r.squared
## [1] 0.5759593
summary(lm(as.numeric(label)~CORR, data = train_val))$r.squared
## [1] 0.3043421
summary(lm(as.numeric(label)~AF_angle, data = train_val))$r.squared
## [1] 0.2584373
summary(lm(as.numeric(label)~x_coordinate, data = train_val))$r.squared
## [1] 0.3259248
summary(lm(as.numeric(label)~y_coordinate, data = train_val))$r.squared
## [1] 0.07891846
summary(lm(as.numeric(label)~SD, data = train_val))$r.squared
## [1] 0.1907891
summary(lm(as.numeric(label)~DF_angle, data = train_val))$r.squared
## [1] 9.926802e-05
summary(lm(as.numeric(label)~CF_angle, data = train_val))$r.squared
## [1] 0.08043943
summary(lm(as.numeric(label)~AN_angle, data = train_val))$r.squared
## [1] 0.255765
library("dvmisc")
## Warning: package 'dvmisc' was built under R version 3.5.2
## Loading required package: rbenchmark
#logistic regression
seed = 123
K = 5
source("CVgeneric.R")
cv_result <- CVgeneric("logistic", c("y_coordinate", "x_coordinate", "NDAI", "SD", "CORR", "DF_angle", "CF_angle", "BF_angle", "AF_angle", 'AN_angle'), "label", K, data=train_val, loss= accuracy, seed)
set.seed(cv_result$seed)
cv_result_method_2 <- CVgeneric("logistic", c("y_coordinate", "x_coordinate", "NDAI", "SD", "CORR", "DF_angle", "CF_angle", "BF_angle", "AF_angle", 'AN_angle'), "label", K, data=method1_train_val, loss= accuracy, seed)
set.seed(cv_result$seed)
#folds = createFolds(train_val[,labels], k = cv_result$num_of_folds)
train_data = train_val[-cv_result$index, ]
logistic_model<- train(label ~ ., data=train_data, method="glm", family="binomial")
#test accuracy
predicted.classes <- logistic_model %>% predict(test_total[,-3])
mean(predicted.classes == test_total$label)
## [1] 0.8959295
library("MLmetrics")
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
#Other error metrics
#F1 Score
F1_Score(as.numeric(predicted.classes), as.numeric(test_total$label))
## [1] 0.9142868
#Sensitivity
Sensitivity(as.numeric(predicted.classes), as.numeric(test_total$label))
## [1] 0.9193266
# After inspecting the model importance, we take out the x and y coordinate and fit the logistic model again
predictions<- data.frame(predict(logistic_model, newdata =test_total[,-3],
type= "prob" ))
colnames(predictions) <- c(-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot",
show.spread.at= seq(0.1, 0.9, by=0.1),
main="Logistic regression cutoff and performance tradeoff")
# The label 1 takes 40%, now I have the data set....
index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=T, print.cutoffs.at=c(max),
text.adj=c(1.5,0.2),
avg="threshold", lwd=3,
main= "Roc curve for logistic regression")
abline(a=0,b=1)
#### Model 2: LDA
source("CVgeneric.R")
cv_result_lda<-CVgeneric("lda", c("y_coordinate", "x_coordinate", "NDAI", "SD", "CORR", "DF_angle", "CF_angle", "BF_angle", "AF_angle", 'AN_angle'), "label", K = 5, train_val, loss = precision, seed)
cv_result_lda_method_2<-CVgeneric("lda", c("y_coordinate", "x_coordinate", "NDAI", "SD", "CORR", "DF_angle", "CF_angle", "BF_angle", "AF_angle", 'AN_angle'), "label", K = 5, method1_train_val, loss = precision, seed)
train_data = train_val[-cv_result_lda$index, ]
lda.model = lda(factor(label)~., data=train_val)
lda_pred<-predict(lda.model, newdata=test_total[,-3])
lda_pre2<-predict(lda.model, newdata=test_total[,-3], type= "prob")
#Other error metrics
#F1 Score
F1_Score(as.numeric(lda_pred$class), as.numeric(test_total$label))
## [1] 0.9164573
#Sensitivity
Sensitivity(as.numeric(lda_pred$class), as.numeric(test_total$label))
## [1] 0.9285281
#Accuracy
mean(lda_pred$class == test_total$label)
## [1] 0.8993176
predictions<- data.frame(predict(lda.model, newdata =test_total[,-3] ))
colnames(predictions) <- c("label",-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot", show.spread.at= seq(0.1, 0.9, by=0.1), main="LDA cutoff and performance tradeoff")
# The label 1 takes 40%, now I have the data set....
index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=T, print.cutoffs.at=c(max), text.adj=c(1,0), avg="threshold", lwd=3, main= "LDA curve for ROC")
abline(a=0,b=1)
method1_train_val$picture<-NULL
# Load CART packages
seed = 123
K = 5
cv_result_decision_tree <- CVgeneric("decision tree", c("y_coordinate", "x_coordinate", "NDAI", "SD", "CORR", "DF_angle", "CF_angle", "BF_angle", "AF_angle", 'AN_angle'), "label", K, data=train_val, loss= accuracy, seed)
train_data = train_val[-cv_result_decision_tree$index, ]
tree = rpart(label ~ ., data=train_data,maxdepth =10, minsplit= 10)
tree_no_xy = rpart(label ~ NDAI+SD+CORR+DF_angle+CF_angle+BF_angle+AF_angle+AN_angle , data=train_data,maxdepth =10, minsplit= 10)
fancyRpartPlot(tree_no_xy)
tree.pred = predict(tree, newdata=test_total[,-3],method = 'response')
tree.pred2 = predict(tree, newdata=test_total[,-3],type = 'class')
mean(tree.pred2==test_total$label)
## [1] 0.922674
fancyRpartPlot(tree)
#Other error metrics
#F1 Score
F1_Score(as.numeric(tree.pred2), as.numeric(test_total$label))
## [1] 0.9341357
#Sensitivity
Sensitivity(as.numeric(tree.pred2), as.numeric(test_total$label))
## [1] 0.9729269
predictions<- data.frame(predict(tree, newdata =test_total[,-3], type= "prob" ))
colnames(predictions) <- c(-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot",
show.spread.at= seq(0.1, 0.9, by=0.1),
main="Decision Tree cutoff and performance tradeoff")
index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=T, print.cutoffs.at=max, text.adj=c(1,0), avg="threshold",
lwd=3,
main= "Roc curve for Decision Tree")
abline(a=0,b=1)
tree_importance<-as.data.frame(varImp(tree))
tree_importance$importance<-rownames(tree_importance)
tree_importance <- tree_importance %>% arrange(desc(Overall))
#CV method 2
cv_result_decision_tree_method2 <- CVgeneric("decision tree", c("y_coordinate", "x_coordinate", "NDAI", "SD", "CORR", "DF_angle", "CF_angle", "BF_angle", "AF_angle", 'AN_angle'), "label", K, data=method1_train_val, loss= accuracy, seed)
train_data_tree_2 = method1_train_val[-cv_result_decision_tree_method2$index, ]
tree2 = rpart(label ~ ., data=train_data_tree_2,maxdepth =10, minsplit= 10)
tree.pred_method_2 <- predict(tree2, newdata=test_total[,-3],type = 'class')
seed = 123
K = 5
#CV method 1
cv_result_rf <- CVgeneric("random forest", c("y_coordinate", "x_coordinate", "NDAI", "SD", "CORR", "DF_angle", "CF_angle", "BF_angle", "AF_angle", 'AN_angle'), "label", K, data=train_val, loss= accuracy, seed)
train_data_rf = train_val[-cv_result_rf$index, ]
#Choose the best hyperparameter of ntry which is the Number of variables randomly sampled as candidates at each split.
Random_forest_hyperparameter_tune<-tuneRF(x =train_data[,-3],
y = as.factor(train_data[,3]),
ntreeTry = 50)
## mtry = 3 OOB error = 0.46%
## Searching left ...
## mtry = 2 OOB error = 0.58%
## -0.2536585 0.05
## Searching right ...
## mtry = 6 OOB error = 0.36%
## 0.2097561 0.05
## mtry = 10 OOB error = 0.41%
## -0.117284 0.05
plot(data.frame(Random_forest_hyperparameter_tune)$mtry,
data.frame(Random_forest_hyperparameter_tune)$OOBError,
type = "l",ylab="OOBError",xlab="mtry",
main = "Random Forest mtry tune")
#Hyperparameter in R for random Forest
#ntree: Number of trees to grow.
rf_model <- randomForest(as.factor(label) ~ y_coordinate + x_coordinate +
NDAI + SD + CORR + DF_angle + CF_angle + BF_angle +
AF_angle + AN_angle, data = train_data_rf,
importance = TRUE,
ntree=50,
ntry=6,
maxnodes= 500,
nodesize = 10)
predicted.classes_rf <- rf_model %>% predict(test_total[,-3])
mean(predicted.classes_rf == test_total$label)
## [1] 0.9856546
#Other error metrics
#F1 Score
F1_Score(as.numeric(predicted.classes_rf), as.numeric(test_total$label))
## [1] 0.988177
#Sensitivity
Sensitivity(as.numeric(predicted.classes_rf), as.numeric(test_total$label))
## [1] 0.994301
predictions<- data.frame(predict(rf_model, newdata =test_total[,-3], type= "prob" ))
colnames(predictions) <- c(-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot",
show.spread.at= seq(0.1, 0.9, by=0.1),
main="Random Foret cutoff and performance tradeoff")
# The label 1 takes 40%, now I have the data set....
index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=T, print.cutoffs.at=c(max), text.adj=c(0.5,0),
avg="threshold", lwd=3, main= "Roc curve for Random Forest")
abline(a=0,b=1)
varImp(rf_model)
## -1 1
## y_coordinate 23.939252 23.939252
## x_coordinate 27.422301 27.422301
## NDAI 26.462382 26.462382
## SD 8.316239 8.316239
## CORR 16.227750 16.227750
## DF_angle 11.040074 11.040074
## CF_angle 10.558653 10.558653
## BF_angle 8.243116 8.243116
## AF_angle 7.569437 7.569437
## AN_angle 10.725195 10.725195
#CV method 2
cv_result_rf_method2 <- CVgeneric("random forest",
c("y_coordinate", "x_coordinate", "NDAI", "SD",
"CORR", "DF_angle", "CF_angle", "BF_angle",
"AF_angle", 'AN_angle'),
"label", K, data=method1_train_val,
loss= accuracy,
seed)
#Print accuracy result
#cv_result_rf_method2
#train data by using the best training sets
train_data_rf2 = method1_train_val[-cv_result_rf_method2$index, ]
rf_model_method_2 <- randomForest(as.factor(label) ~ y_coordinate +
x_coordinate + NDAI + SD + CORR +
DF_angle + CF_angle + BF_angle +AF_angle +
AN_angle,
data = train_data_rf2,
importance = TRUE,ntree=50,
ntry=6,maxnodes= 500, nodesize = 10)
predicted.classes_rf_method_2 <- rf_model_method_2 %>% predict(test_total[,-3])
#Even though decision tree does not give the best train/test accuracy, it's a good method of classification due to its clearity in procudure.
#Even though the cross-validation accuracy showed that the there is no overfitting in the training set, we show the concept of tree prune here which is a method used in decisino tree to balance the cost complexity and model outcome.
#We can prune the tree using a cost complexity prune of 0.02
ggplot(data=tree_importance, aes(x=reorder(importance, -Overall), y=Overall)) +
geom_bar(stat="identity", color="black", fill="lightgreen")+
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
xlab("features")+
ylab("importance")+
ggtitle("Feature importance in Decision Tree")
fit <- rpart(label~.,
method="anova", data=train_val)
printcp(fit) # display the results
##
## Regression tree:
## rpart(formula = label ~ ., data = train_val, method = "anova")
##
## Variables actually used in tree construction:
## [1] AN_angle CORR NDAI x_coordinate
##
## Root node error: 39564/166443 = 0.23771
##
## n= 166443
##
## CP nsplit rel error xerror xstd
## 1 0.643034 0 1.00000 1.00001 0.0011149
## 2 0.034628 1 0.35697 0.35707 0.0022032
## 3 0.021174 2 0.32234 0.32249 0.0021659
## 4 0.020619 3 0.30116 0.30698 0.0020901
## 5 0.015749 4 0.28055 0.28105 0.0018821
## 6 0.010000 5 0.26480 0.26543 0.0019129
plotcp(fit)
summary(fit)
## Call:
## rpart(formula = label ~ ., data = train_val, method = "anova")
## n= 166443
##
## CP nsplit rel error xerror xstd
## 1 0.64303417 0 1.0000000 1.0000117 0.001114925
## 2 0.03462778 1 0.3569658 0.3570734 0.002203184
## 3 0.02117380 2 0.3223381 0.3224903 0.002165878
## 4 0.02061901 3 0.3011642 0.3069791 0.002090116
## 5 0.01574880 4 0.2805452 0.2810524 0.001882125
## 6 0.01000000 5 0.2647964 0.2654345 0.001912880
##
## Variable importance
## NDAI SD CORR AN_angle AF_angle
## 25 18 15 15 13
## BF_angle x_coordinate CF_angle DF_angle
## 12 1 1 1
##
## Node number 1: 166443 observations, complexity param=0.6430342
## mean=1.389118, MSE=0.2377052
## left son=2 (90754 obs) right son=3 (75689 obs)
## Primary splits:
## NDAI < 0.7947012 to the left, improve=0.6430342, (0 missing)
## SD < 3.080932 to the left, improve=0.4653290, (0 missing)
## CORR < 0.1966437 to the left, improve=0.4485266, (0 missing)
## x_coordinate < 229.5 to the right, improve=0.2647618, (0 missing)
## AN_angle < 177.6024 to the right, improve=0.2641968, (0 missing)
## Surrogate splits:
## SD < 3.134231 to the left, agree=0.868, adj=0.709, (0 split)
## CORR < 0.1949229 to the left, agree=0.807, adj=0.576, (0 split)
## AN_angle < 181.8939 to the right, agree=0.783, adj=0.522, (0 split)
## AF_angle < 186.9681 to the right, agree=0.763, adj=0.478, (0 split)
## BF_angle < 245.0755 to the right, agree=0.747, adj=0.443, (0 split)
##
## Node number 2: 90754 observations, complexity param=0.0211738
## mean=1.032076, MSE=0.03104687
## left son=4 (89546 obs) right son=5 (1208 obs)
## Primary splits:
## AN_angle < 172.2778 to the right, improve=0.2973169, (0 missing)
## AF_angle < 177.8793 to the right, improve=0.1943166, (0 missing)
## NDAI < 0.3929783 to the left, improve=0.1777455, (0 missing)
## CORR < 0.2075311 to the left, improve=0.1707469, (0 missing)
## BF_angle < 190.1277 to the right, improve=0.1378417, (0 missing)
## Surrogate splits:
## AF_angle < 178.8896 to the right, agree=0.996, adj=0.667, (0 split)
## BF_angle < 191.1332 to the right, agree=0.992, adj=0.410, (0 split)
## CF_angle < 197.68 to the right, agree=0.989, adj=0.205, (0 split)
## DF_angle < 209.0842 to the right, agree=0.988, adj=0.104, (0 split)
## y_coordinate < 21.5 to the right, agree=0.988, adj=0.065, (0 split)
##
## Node number 3: 75689 observations, complexity param=0.03462778
## mean=1.817226, MSE=0.1493678
## left son=6 (6902 obs) right son=7 (68787 obs)
## Primary splits:
## x_coordinate < 295.5 to the right, improve=0.12118230, (0 missing)
## CORR < 0.1938472 to the left, improve=0.11668150, (0 missing)
## y_coordinate < 45.5 to the left, improve=0.07704374, (0 missing)
## DF_angle < 261.4637 to the left, improve=0.06366120, (0 missing)
## SD < 2.742566 to the left, improve=0.05255161, (0 missing)
## Surrogate splits:
## AN_angle < 263.4902 to the right, agree=0.909, adj=0.003, (0 split)
## AF_angle < 277.8265 to the right, agree=0.909, adj=0.003, (0 split)
## CORR < -0.3280679 to the left, agree=0.909, adj=0.000, (0 split)
## BF_angle < 300.493 to the right, agree=0.909, adj=0.000, (0 split)
##
## Node number 4: 89546 observations
## mean=1.020917, MSE=0.02047912
##
## Node number 5: 1208 observations
## mean=1.859272, MSE=0.120924
##
## Node number 6: 6902 observations, complexity param=0.0157488
## mean=1.392495, MSE=0.2384427
## left son=12 (4542 obs) right son=13 (2360 obs)
## Primary splits:
## AN_angle < 195.8497 to the left, improve=0.3786106, (0 missing)
## AF_angle < 211.1927 to the left, improve=0.3404610, (0 missing)
## BF_angle < 230.3768 to the left, improve=0.3158916, (0 missing)
## y_coordinate < 66.5 to the left, improve=0.2906732, (0 missing)
## CF_angle < 249.0817 to the left, improve=0.2870195, (0 missing)
## Surrogate splits:
## AF_angle < 209.1577 to the left, agree=0.946, adj=0.841, (0 split)
## BF_angle < 230.3455 to the left, agree=0.908, adj=0.731, (0 split)
## CF_angle < 249.1973 to the left, agree=0.871, adj=0.622, (0 split)
## y_coordinate < 77.5 to the left, agree=0.843, adj=0.540, (0 split)
## DF_angle < 264.0044 to the left, agree=0.841, adj=0.536, (0 split)
##
## Node number 7: 68787 observations, complexity param=0.02061901
## mean=1.859843, MSE=0.1205132
## left son=14 (24633 obs) right son=15 (44154 obs)
## Primary splits:
## CORR < 0.1990036 to the left, improve=0.09840812, (0 missing)
## SD < 2.615267 to the left, improve=0.04340106, (0 missing)
## DF_angle < 292.9841 to the left, improve=0.04142987, (0 missing)
## AF_angle < 227.3604 to the right, improve=0.04092657, (0 missing)
## NDAI < 3.27511 to the right, improve=0.03594479, (0 missing)
## Surrogate splits:
## AF_angle < 224.184 to the right, agree=0.716, adj=0.207, (0 split)
## AN_angle < 206.0282 to the right, agree=0.716, adj=0.206, (0 split)
## SD < 3.828523 to the left, agree=0.703, adj=0.170, (0 split)
## DF_angle < 280.5912 to the left, agree=0.692, adj=0.141, (0 split)
## y_coordinate < 276.5 to the right, agree=0.685, adj=0.120, (0 split)
##
## Node number 12: 4542 observations
## mean=1.175914, MSE=0.1449681
##
## Node number 13: 2360 observations
## mean=1.809322, MSE=0.1543199
##
## Node number 14: 24633 observations
## mean=1.714042, MSE=0.204186
##
## Node number 15: 44154 observations
## mean=1.941183, MSE=0.05535744
rsq.rpart(fit)
##
## Regression tree:
## rpart(formula = label ~ ., data = train_val, method = "anova")
##
## Variables actually used in tree construction:
## [1] AN_angle CORR NDAI x_coordinate
##
## Root node error: 39564/166443 = 0.23771
##
## n= 166443
##
## CP nsplit rel error xerror xstd
## 1 0.643034 0 1.00000 1.00001 0.0011149
## 2 0.034628 1 0.35697 0.35707 0.0022032
## 3 0.021174 2 0.32234 0.32249 0.0021659
## 4 0.020619 3 0.30116 0.30698 0.0020901
## 5 0.015749 4 0.28055 0.28105 0.0018821
## 6 0.010000 5 0.26480 0.26543 0.0019129
#refer section 4d
adaboost<- adaboost(label~y_coordinate + x_coordinate + NDAI + SD + CORR + DF_angle + CF_angle+ BF_angle + AF_angle + AN_angle ,data=train_val, nIter=10)
predictions<- data.frame(predict(adaboost, newdata =test_total[,-3] )$prob)
colnames(predictions) <- c(-1,1)
mean(predict(adaboost, newdata =test_total[,-3] )$class == test_total$label)
## [1] 0.9977893
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot", show.spread.at= seq(0.1, 0.9, by=0.1), main="Adaboosting cutoff and performance tradeoff")
# The label 1 takes 40%, now I have the data set....
index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=T, print.cutoffs.at=c(max), text.adj=c(0.3,0), avg="threshold", lwd=3, main= "Adaboosting curve for ROC")
abline(a=0,b=1)
#Misclassification pattern for random forest
mis_label_random_forest<- train_data_rf[predicted.classes_rf!=test_total$label,]
ggplot(mis_label_random_forest)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate, color = factor(label)))+
xlab(" x coordinate") + ylab(" y coordinate") + ggtitle("Mis-classification in random-forest model split method 1 ")+ theme_bw()
Predicted <-predicted.classes_rf
Actual<- test_total$label
fourfoldplot(table(Predicted, Actual))
#Method 2
mis_label_random_forest<- train_data_rf2[predicted.classes_rf_method_2!=test_total$label,]
ggplot(mis_label_random_forest)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate, color = factor(label)))+
xlab(" x coordinate") + ylab(" y coordinate") + ggtitle("Mis-classification in random-forest model split method 2 ")+ theme_bw()
Predicted <-predicted.classes_rf_method_2
fourfoldplot(table(Predicted, Actual))
#Misclassification pattern for decision tree
mis_label_decision_tree<- train_data[tree.pred2!=test_total$label,]
ggplot(mis_label_decision_tree)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate, color = factor(label)))+
xlab(" x coordinate") + ylab(" y coordinate") + ggtitle("Mis-classification in decision tree model split method 1 ")+ theme_bw()
Predicted <-tree.pred2
fourfoldplot(table(Predicted, Actual))
#Method 2
mis_label_decision_tree2<- train_data_tree_2[tree.pred_method_2!=test_total$label,]
ggplot(mis_label_decision_tree2)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate, color = factor(label)))+
xlab(" x coordinate") + ylab(" y coordinate") + ggtitle("Mis-classification in decision tree model split method 2 ")+ theme_bw()
Predicted <-tree.pred_method_2
fourfoldplot(table(Predicted, Actual))